1 Fig2

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RColorBrewer)
# library(egg)
## eval 代码要不要运行
## echo 代码要不要输出
## include 图要不要输出
## warning 警告要不要输出
## message 默认Bin等信息要不要输出

1.1 dftotal and dftotal_withAncestral

dftotal <- read_tsv("data/005_SNPAnnoDB/geneSNPAnno.txt.gz") %>%
  mutate(Sub=str_sub(Transcript, 9, 9))
## Rows: 2672838 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): ID, Ref, Alt, Major, Minor, Transcript, Ancestral, Region, Variant...
## dbl (20): Chr, Pos, Maf, DAF, Gerp, Alt_SIFT, Ref_SIFT, Derived_SIFT, Gerp_1...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  # filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
  # mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  # mutate(Sub=str_sub(Transcript, 9, 9)) %>% 

dftotal_ancestral <- dftotal %>% 
  filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt) 

### 现将基因组过滤,只保留有 ancestral 状态的,并且添加LoadGroup、IfRefisAnc、CommonOrRare 等分组信息
dftotal_withAncestral <- dftotal %>% 
  filter(Region=="CDS",Variant_type=="SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>%
  filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt) %>%
  mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  mutate(CommonOrRare = if_else(Maf <= 0.05, "Rare","Common")) %>% 
  filter(!is.na(LoadGroup))
### 特别注意,这里根据MAF进行分组,按照维基百科的定义
### 2个filter 5个mutate 1个filter


df <- dftotal %>% 
  filter(Region=="CDS",Variant_type=="SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>%
  filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt) %>%
  mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  mutate(CommonOrRare = if_else(Maf <= 0.05, "Rare","Common")) %>% 
  filter(!is.na(LoadGroup)) %>% 
  filter(Ref_PolyPhen2_prob==Alt_PolyPhen2_prob)

1.1.0.1 =====================

1.2 VariantsType count all region

### 需要手动选择
factor1 <- "Region"
factor2 <- "Variant_type"
factor3 <- "Effect_VEP"
factor4 <- "Effect_snpEff"

df_variantsType1 <- dftotal %>% group_by(Region) %>% 
  # filter(!is.na(Region)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Region)

df_variantsType2 <- dftotal %>% group_by(Variant_type) %>% 
  filter(!is.na(Variant_type)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Variant_type)

df_variantsType3 <- dftotal %>% group_by(Effect_VEP) %>% 
  filter(!is.na(Effect_VEP)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Effect_VEP)

print("Effect_VEP class:")
## [1] "Effect_VEP class:"
df_variantsType3$Type
##  [1] "3_prime_UTR_variant"                                      
##  [2] "5_prime_UTR_variant"                                      
##  [3] "coding_sequence_variant"                                  
##  [4] "incomplete_terminal_codon_variant,coding_sequence_variant"
##  [5] "intron_variant"                                           
##  [6] "missense_variant"                                         
##  [7] "missense_variant,splice_region_variant"                   
##  [8] "splice_acceptor_variant"                                  
##  [9] "splice_acceptor_variant,missense_variant"                 
## [10] "splice_acceptor_variant,synonymous_variant"               
## [11] "splice_donor_variant"                                     
## [12] "splice_donor_variant,missense_variant"                    
## [13] "splice_donor_variant,stop_gained"                         
## [14] "splice_region_variant,3_prime_UTR_variant"                
## [15] "splice_region_variant,5_prime_UTR_variant"                
## [16] "splice_region_variant,coding_sequence_variant"            
## [17] "splice_region_variant,intron_variant"                     
## [18] "splice_region_variant,stop_retained_variant"              
## [19] "splice_region_variant,synonymous_variant"                 
## [20] "start_lost"                                               
## [21] "start_lost,splice_region_variant"                         
## [22] "stop_gained"                                              
## [23] "stop_gained,splice_region_variant"                        
## [24] "stop_lost"                                                
## [25] "stop_lost,splice_region_variant"                          
## [26] "stop_retained_variant"                                    
## [27] "synonymous_variant"
df_variantsType4 <- dftotal %>% group_by(Effect_snpEff) %>% 
  filter(!is.na(Effect_snpEff)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Effect_snpEff)

print("=======================")
## [1] "======================="
print("Effect_SnpEff class:")
## [1] "Effect_SnpEff class:"
df_variantsType4$Type
##  [1] "3_prime_UTR_variant"                                         
##  [2] "5_prime_UTR_premature_start_codon_gain_variant"              
##  [3] "5_prime_UTR_variant"                                         
##  [4] "initiator_codon_variant"                                     
##  [5] "initiator_codon_variant&splice_region_variant"               
##  [6] "intron_variant"                                              
##  [7] "missense_variant"                                            
##  [8] "missense_variant&splice_region_variant"                      
##  [9] "splice_acceptor_variant&intron_variant"                      
## [10] "splice_acceptor_variant&splice_donor_variant&intron_variant" 
## [11] "splice_acceptor_variant&splice_region_variant&intron_variant"
## [12] "splice_donor_variant&intron_variant"                         
## [13] "splice_donor_variant&splice_region_variant&intron_variant"   
## [14] "splice_region_variant"                                       
## [15] "splice_region_variant&intron_variant"                        
## [16] "splice_region_variant&stop_retained_variant"                 
## [17] "splice_region_variant&synonymous_variant"                    
## [18] "start_lost"                                                  
## [19] "start_lost&splice_region_variant"                            
## [20] "stop_gained"                                                 
## [21] "stop_gained&splice_region_variant"                           
## [22] "stop_lost"                                                   
## [23] "stop_lost&splice_region_variant"                             
## [24] "stop_retained_variant"                                       
## [25] "synonymous_variant"
colB <- brewer.pal(9,"Blues")[c(2,3,4,5,6,7)]

p <- df_variantsType1 %>% 
ggplot(aes(x=Type,y=n,fill=Type))+
  geom_bar(stat = 'identity',position = "stack")+
  geom_text(aes(label=n),size=3,nudge_y = 1)+
  labs(x="", y="SNP count")+
  scale_fill_brewer(palette = "Set2")+
  # scale_fill_manual(values = colB)+
  # scale_fill_brewer(palette = "Blues")+
  theme_classic()+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 10))

p

p <- df_variantsType2 %>% 
ggplot(aes(x=Type,y=n,fill=Type))+
  geom_bar(stat = 'identity',position = "stack")+
  geom_text(aes(label=n),size=3,nudge_y = 1)+
  labs(x="", y="SNP count")+
  scale_fill_brewer(palette = "Set2")+
  # scale_fill_manual(values = colB)+
  # scale_fill_brewer(palette = "Blues")+
  theme_classic()+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 10))

p

1.2.0.1 ——————–

1.3 **Fig2:variants count all region

### "Intron" "UTR_3"  "UTR_5" 
df1 <- dftotal %>% group_by(Region) %>% 
  # filter(!is.na(Region)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Region) %>% 
  filter(!Type=="CDS")

### "NONSYNONYMOUS" "STOP-GAIN"     "SYNONYMOUS" 
df2 <- dftotal %>% group_by(Variant_type) %>% 
  filter(!is.na(Variant_type)) %>% 
  summarise(n=n()) %>% 
  rename(Type=Variant_type) %>% 
  filter(!Type %in%  c("NONCODING","START-LOST","STOP-LOSS")) 

# expression(paste("Length-Freq of", italic("E. coruscans"), "by Gear")) 

### "Essential splice"
### 主要发生在内含子区域
df3 <- dftotal %>% 
  filter(Impact_VEP=="HIGH") %>%
  filter(str_detect(Effect_VEP,"splice_")) %>% 
  count() %>% 
  mutate(Type="Essential splice") %>% 
  relocate(Type,.before = n)

### Intergenic
df4 <- tibble(Type="Intergenic",n=193285480)
  
df <- rbind(df1,df2,df3,df4)

factorA <- c("Intergenic","Intron","UTR_5","UTR_3","SYNONYMOUS","NONSYNONYMOUS","Essential splice","STOP-GAIN")
labelsA <- c("Intergenic","Intron","UTR_5","UTR_3","Synonymous","Nonsynonymous","Essential splice","Nonsense")
colB <- c(rep("gray70",4),"#004680","#e69f00","darkred","darkred")

colB <- rev(colB)

df$Type <- factor(df$Type,levels = rev(factorA),labels = rev(labelsA))

1.3.1 *plot

library(ggbreak)
## ggbreak v0.0.7
## 
## If you use ggbreak in published research, please cite the following paper:
## 
## S Xu, M Chen, T Feng, L Zhan, L Zhou, G Yu. Use ggbreak to effectively utilize plotting space to deal with large datasets and outliers. Frontiers in Genetics. 2021, 12:774846. doi: 10.3389/fgene.2021.774846
p <- df %>% 
  ggplot(aes(x= Type,y=n,fill=Type))+
  geom_bar(stat = 'identity',position = "stack")+
  # geom_text(aes(label=n),size=3,nudge_y = 1)+
  geom_hline(yintercept =c(400000,1400000), linetype="dashed", color = "gray")+
  geom_hline(yintercept =c(2000000,190000000), linetype="dashed", color = "gray")+
  scale_fill_manual(values = colB)+
  scale_y_break(c(400000, 1400000), scales = 0.5)+
  scale_y_break(c(2000000, 190000000), scales = 0.5)+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  labs(x="",y="SNP count")+
  theme_classic()+
  coord_flip()+
  theme(axis.text.x= element_text(angle=90, vjust=0,hjust = 0))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 12))


p

# ggsave(p,filename = "~/Documents/test.pdf", width = 4, height = 5)
ggsave(p,filename = "~/Documents/snpNum.pdf", width = 3, height = 6)

# ggsave(p,filename = "~/Documents/test.png", width = 4, height = 5)

1.3.1.1 ——————–

1.4 GERP distribution

gerpDir <- "data/006_Fig2/gerp/001_gerp_subset"

ChrRef <-rep(str_c(rep(1:7,each=3),rep(c("A","B","D"),7)),each=2)  
df_chrConvertion <- tibble(Chr=c(1:42),ChrRef=ChrRef,Sub=str_sub(ChrRef,2))

df_intergenic <- dir(gerpDir, full.names = T, pattern = ".txt.gz") %>% 
  map(read_tsv) %>% 
  bind_rows() %>% 
  filter(GeneFuture=="INTERGENIC") %>% 
  rename(Chr=ChrID) %>% 
  left_join(.,df_chrConvertion, by = "Chr")
## Rows: 89553 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 33318 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 84667 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53088 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 120679 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 15824 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 90215 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 70257 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 93526 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 72871 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 120052 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 54897 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 84013 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 67633 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 86700 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 79989 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 124884 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 42382 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 75813 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71928 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 81987 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 43772 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 109061 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 18522 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 87199 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 62240 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 92576 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 61571 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 119615 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 37557 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 83318 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 38341 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 93946 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 61687 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 115395 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 9273 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 113382 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 75263 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 98529 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 70249 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 120984 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 52656 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): GeneFuture
## dbl (3): ChrID, Pos, GERP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_region <- dftotal %>% 
  filter(Region %in%  c("Intron","UTR_5","UTR_3")) %>%
  select(Chr,Pos,GERP=Gerp_16way,GeneFuture=Region) %>% 
  left_join(.,df_chrConvertion, by = "Chr") 

df_variantsType <- dftotal %>% 
  filter(Variant_type %in%  c("NONSYNONYMOUS","SYNONYMOUS","STOP-GAIN")) %>%
  select(Chr,Pos,GERP=Gerp_16way,GeneFuture=Variant_type) %>% 
  left_join(.,df_chrConvertion, by = "Chr") 

df_splice <- dftotal %>% 
  filter(Impact_VEP=="HIGH") %>%
  filter(str_detect(Effect_VEP,"splice_")) %>% 
  select(Chr,Pos,GERP=Gerp_16way) %>% 
  mutate(GeneFuture="Essential splice") %>% 
  left_join(.,df_chrConvertion, by = "Chr") 

df <- rbind(df_intergenic,df_region,df_variantsType,df_splice)

factorA <- c("INTERGENIC","Intron","UTR_5","UTR_3","SYNONYMOUS","NONSYNONYMOUS","Essential splice","STOP-GAIN")
labelsA <- c("Intergenic","Intron","UTR_5","UTR_3","Synonymous","Nonsynonymous","Essential splice","Nonsense")
colB <- c(rep("gray70",4),"#004680","#e69f00","darkred","darkred")

df$GeneFuture <- factor(df$GeneFuture,levels = factorA,labels = labelsA)

dfgerpdis <- df %>% 
    slice_sample(prop = 0.3) 

1.4.1 plot

### histogram
p <- dfgerpdis %>%
  group_by(GeneFuture) %>% 
  ggplot(aes(x=GERP,fill=GeneFuture))+
  geom_histogram(aes(y=..ndensity..), alpha=0.95)+
  facet_grid(GeneFuture~., scales = "free")+
  # facet_grid(.~GeneFuture, scales = "free")+
  labs(x="GERP score",y="Density")+
  scale_y_continuous(breaks = seq(0,1,0.5))+
  scale_fill_manual(values = colB)+
  scale_color_manual(values = colB)+
  theme(
    strip.background = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 12))

p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101324 rows containing non-finite values (stat_bin).

ggsave(plot = p, "~/Documents/test.pdf", width = 3, height = 5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101324 rows containing non-finite values (stat_bin).

1.4.1.1 ——————–

1.5 Deleterious count

1.5.1 1.区分亚基因组

### GERP & PPH2 方法定义有害突变
df <- dftotal %>% 
  mutate(Sub=str_sub(Transcript, 9, 9)) %>% 
  filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>% 
  # filter(!is.na(DAF)) %>%
  mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>%
  group_by(Sub) %>% 
  count(LoadGroup) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(Sub,desc(LoadGroup))

### GERP 方法定义有害突变
df2 <- dftotal %>% 
  mutate(Sub=str_sub(Transcript, 9, 9)) %>% 
  filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
  filter(!is.na(DAF)) %>%
  # mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>% 
  mutate(LoadGroup=ifelse(Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=2.14, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>% 
  group_by(Sub) %>% 
  count(LoadGroup) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(Sub,desc(LoadGroup))


### *****************************plot***********************************************
colB <- c("#d5311c","#e69f00","#004680")
df$LoadGroup <- factor(df$LoadGroup,levels = c("Deleterious","Nonsynonymous","Synonymous"))

### 只画柱状图 dodge
p <- df %>%
ggplot(aes(x=Sub,y=n,fill=LoadGroup))+
  geom_bar(stat = 'identity',position = "dodge")+
  geom_text(aes(label=paste(n,"\n(",round(fre,2),")",sep = "")),size=3,
            position = position_dodge(0.9))+
  # geom_text(aes(label=paste(n,"\n(",round(fre,2),")",sep = "")),size=3,nudge_y = 1)+
  labs(x="", y="SNP count")+
  scale_fill_manual(values = colB)+
  theme_classic()+
  # theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 10))

p

### 只画柱状图 stack
p <- df %>% 
  ggplot(aes(x=Sub, y=n,fill=LoadGroup)) +
  geom_bar(stat="identity",alpha = 1)+  #,position=position_dodge()
  geom_text(aes(label=  paste(prettyNum(n,big.mark = ","),"(",round(fre,2),")")),position =  position_stack(),vjust=1.5, size=3) +
  scale_fill_manual(values=colB)+
  labs(y="SNP count",x="")+
  theme(panel.background = element_rect(size = 0.7, colour = 'black',fill = 'white'),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),text = element_text(size = 12)
        ,legend.position = 'none'
        )

p

### 只画柱状图 stack 百分比是1
p <- df %>% 
  ggplot(aes(x=Sub, y=n,fill=LoadGroup)) +
  geom_bar(stat="identity",alpha = 1,position = "fill")+  #,position=position_dodge()
  geom_text(aes(label=  paste(prettyNum(n,big.mark = ","),"(",round(fre,2),")")),position = position_fill(),vjust=1.5, size=3) +
  scale_fill_manual(values=colB)+
  labs(y="Proportion",x="")+
  theme(panel.background = element_rect(size = 0.7, colour = 'black',fill = 'white'),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),text = element_text(size = 12)
        ,legend.position = 'none'
        )

p

ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)

1.5.2 2.不区分亚基因组

### 不区分亚基因组
### GERP & PPH2 方法定义有害突变
df <- dftotal %>% 
  mutate(Sub=str_sub(Transcript, 9, 9)) %>% 
  filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
  filter(!is.na(DAF)) %>%
  # mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  # mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>% 
  # filter(Sub=="B") %>%
  count(LoadGroup) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(desc(LoadGroup))


### GERP 方法定义有害突变
df2 <- dftotal %>% 
  mutate(Sub=str_sub(Transcript, 9, 9)) %>% 
  filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
  filter(!is.na(DAF)) %>%
  # mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>% 
  mutate(LoadGroup=ifelse(Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=2.14, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>% 
  # filter(Sub=="B") %>% 
  count(LoadGroup) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(desc(LoadGroup))


### *****************************plot***********************************************
colB <- c("#d5311c","#e69f00","#004680")
df$LoadGroup <- factor(df$LoadGroup,levels = c("Deleterious","Nonsynonymous","Synonymous"))

### 只画柱状图 dodge
p <- df %>%
ggplot(aes(x=LoadGroup,y=n,fill=LoadGroup))+
  geom_bar(stat = 'identity',position = "dodge")+
  geom_text(aes(label=paste(n,"\n(",round(fre,2),")",sep = "")),size=1,
            position = position_dodge(0.9))+
  # geom_text(aes(label=paste(n,"\n(",round(fre,2),")",sep = "")),size=3,nudge_y = 1)+
  labs(x="", y="SNP count")+
  scale_fill_manual(values = colB)+
  theme_classic()+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 7))

p

ggsave(p,filename = "~/Documents/test.pdf",height = 1.725,width = 2.3)

1.5.2.1 ===== Gene level ================

1.6 韦恩图 gene common 和 gene rare 列表交集

### https://rdrr.io/cran/VennDiagram/man/venn.diagram.html
### 含有 common 基因的基因列表
dfcommon <- dftotal_withAncestral %>% 
  group_by(Transcript) %>% 
  count(CommonOrRare) %>% 
  filter(CommonOrRare == "Common") %>% 
  dplyr::select(Transcript)

### 含有 rare 基因的基因列表
dfrare <- dftotal_withAncestral %>% 
  group_by(Transcript) %>% 
  count(CommonOrRare) %>% 
  filter(CommonOrRare == "Rare") %>%
  dplyr::select(Transcript)

library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
#Make the plot
venn.diagram(
  x = list(dfcommon$Transcript,dfrare$Transcript), 
  main = "common and rare genes",
  category.names = c("common" , "rare" ),
  filename = "~/Documents/gene_statistic.png",
  output = T ,
  
  # Output features
  imagetype="png" ,
  # height = 480 , 
  # width = 480 , 
  # resolution = 300,
  # compression = "lzw",
  
  # Circles
  lwd = 3,
  col=c("#440154ff", '#21908dff'),
  fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3)),
  
  # Numbers
  cex = 0.5,
  fontfamily = "sans",
  cat.cex = 0.3,
  cat.default.pos = "outer",
  ## cat.pos = c(-27, 27, 135),
  ## cat.dist = c(0.055, 0.055, 0.085),
  cat.pos = c(-27, 27),
  cat.dist = c(0.055, 0.055),
  cat.fontfamily = "sans",
  cat.col = c("#440154ff", '#21908dff')
  # rotation = 1
)
## [1] 1

1.7 Gene count summary

### 1.计算多少个基因含有突变 47490
genenum <- length(unique(dftotal_withAncestral$Transcript))
print(paste("Gene num with SNP: ",genenum))
## [1] "Gene num with SNP:  47490"
### 1.基因分类为 common rare commonAndRare

df_geneCategory <- dftotal_withAncestral %>% 
  dplyr::select(ID,Transcript,CommonOrRare) %>% 
  group_by(Transcript,CommonOrRare) %>% 
  count() %>% 
  pivot_wider(names_from = CommonOrRare,values_from = n,values_fill = 0) %>% 
  mutate(GeneGroup=ifelse(Common>0 & Rare >0 ,"WithCommonandRareSNPs",
                          ifelse(Common>0 & Rare ==0,"OnlyCommonSNPs",
                                 ifelse(Common==0 & Rare >0,"OnlyRareSNPs",NA))))

  
### 
df_geneCategory_num <- df_geneCategory %>% 
  group_by(GeneGroup) %>% count()

print(df_geneCategory_num)
## # A tibble: 3 × 2
## # Groups:   GeneGroup [3]
##   GeneGroup                 n
##   <chr>                 <int>
## 1 OnlyCommonSNPs         1397
## 2 OnlyRareSNPs          25731
## 3 WithCommonandRareSNPs 20362
p <- df_geneCategory_num %>% 
  ggplot(aes(x=GeneGroup, y=n,fill=GeneGroup)) +
  geom_bar(stat="identity",alpha = 1)+  #,position=position_dodge()
  geom_text(aes(label=prettyNum(n,big.mark = ",")),position =  position_stack(),vjust=1.5, size=3) +
  scale_fill_brewer(palette = "Set3")+
  labs(y="Gene count",x="")+
  # theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  theme(panel.background = element_rect(size = 0.7, colour = 'black',
        fill = 'white'),
        panel.grid = element_blank(),
        legend.position = 'none',
        text = element_text(size = 12))

p

ggsave(p,filename = "~/Documents/test1.pdf",height = 3,width = 4)


### ======================================
### 2.基因分类为 common-del/nodel  rare-del/nodel commonAndRare-del/nodel

df_geneCategory2 <- dftotal_withAncestral %>% 
  dplyr::select(ID,Transcript,CommonOrRare,LoadGroup) %>% 
  mutate(IfDel=ifelse(LoadGroup=="Deleterious","Del","NoDel")) %>%
  group_by(Transcript,IfDel) %>% 
  count() %>% 
  pivot_wider(names_from = IfDel,values_from = n,values_fill = 0) %>% 
  mutate(DelGroup=ifelse(Del>0 ,"HasDeleteriousSNP","NoDeleteriousSNP")) %>% 
  left_join(df_geneCategory,by="Transcript")

df_geneCategory2_num <- df_geneCategory2 %>% 
  group_by(GeneGroup,DelGroup) %>% 
  count()
 
print(df_geneCategory2_num)
## # A tibble: 6 × 3
## # Groups:   GeneGroup, DelGroup [6]
##   GeneGroup             DelGroup              n
##   <chr>                 <chr>             <int>
## 1 OnlyCommonSNPs        HasDeleteriousSNP   149
## 2 OnlyCommonSNPs        NoDeleteriousSNP   1248
## 3 OnlyRareSNPs          HasDeleteriousSNP  8681
## 4 OnlyRareSNPs          NoDeleteriousSNP  17050
## 5 WithCommonandRareSNPs HasDeleteriousSNP 11081
## 6 WithCommonandRareSNPs NoDeleteriousSNP   9281
### ================== 画图 ====================
colB <- c("#d5311c","gray") ###一个是有害突变,一个是没有有害突变
p <- df_geneCategory2_num %>% 
  ggplot(aes(x=GeneGroup, y=n,fill=DelGroup)) +
  geom_bar(stat="identity",alpha = 1)+  #,position=position_dodge()
  geom_text(aes(label=prettyNum(n,big.mark = ",")),position =  position_stack(),vjust=1.5, size=3) +
  scale_fill_manual(values=colB)+
  labs(y="Gene count",x="")+
  # theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  theme(panel.background = element_rect(size = 0.7, colour = 'black',fill = 'white'),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),text = element_text(size = 12)
        ,legend.position = 'none'
        )

p

ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)

1.8 Gene MAF LoadGroup

## 每个基因至少出现3次,即每个基因至少含有Synonymous Nonsynonymous Deleterious
## 过滤后,每个基因按照rare的总数进行排序,由低到高,并进行间隔抽样,抽取100个试试看

### 每个基因含有的rare变异的分布范围
df_common_rare <- dftotal_withAncestral %>%
  group_by(Transcript,CommonOrRare) %>% 
  count() %>% 
  ungroup() %>% 
  filter(CommonOrRare == "Rare") %>% 
  select(Transcript,RareVariantsCount=n)

print(str_c(nrow(df_common_rare)," genes which has rare SNPs."))
## [1] "46093 genes which has rare SNPs."
print(str_c("The range of rare SNPs is ",min(df_common_rare$n)," - ",max(df_common_rare$n)))
## Warning: Unknown or uninitialised column: `n`.
## Warning in min(df_common_rare$n): no non-missing arguments to min; returning Inf
## Warning: Unknown or uninitialised column: `n`.
## Warning in max(df_common_rare$n): no non-missing arguments to max; returning
## -Inf
## [1] "The range of rare SNPs is Inf - -Inf"
df_stage1 <- dftotal_withAncestral %>% 
  group_by(Transcript,CommonOrRare,LoadGroup) %>% 
  count(LoadGroup) %>% 
  mutate(Sub = str_sub(Transcript,9,9)) %>% 
  pivot_wider(names_from = CommonOrRare, values_from  = n,values_fill = 0)

df_filter <- df_stage1 %>% 
  group_by(Transcript) %>% 
  count() %>% 
  filter(n == 3) %>% 
  select(Transcript) %>% 
  left_join(.,df_common_rare,by="Transcript") %>% 
  filter(!is.na(RareVariantsCount)) %>% 
  ungroup()
  ### nuique 一下计数

df_sample <- df_filter %>% 
  distinct(RareVariantsCount,.keep_all=TRUE) %>% 
  arrange(RareVariantsCount)

df <- df_sample %>% 
  left_join(.,df_stage1,by="Transcript") %>% 
  rename(Gene = Transcript)

## 处理抽样基因的CDS长度
df_geneHC <- read_tsv("data/006_Fig2/wheat_v1.1_nonoverlap_addPos.txt") %>%
  select(Longest_transcript,CDSLength) %>% 
  rename(Transcript=Longest_transcript)
## Rows: 105200 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Longest_transcript
## dbl (9): Is_Overlapped_gene(1,0), Number_Overlapped_gene, Is_Unique_gene(1,0...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_geneCDSlength <- df_sample %>% 
  left_join(.,df_geneHC,by="Transcript") %>% 
  rename(Gene = Transcript)

1.8.1 plot

colB <- c("#d5311c","#e69f00","#004680")
p <- ggplot()+
  geom_bar(data = df, mapping = aes(x=reorder(Gene,RareVariantsCount) ,y=-Rare,fill=LoadGroup),stat = 'identity')+
  geom_bar(data = df, mapping = aes(x=Gene,y=Common,fill = LoadGroup),stat = 'identity')+
  # geom_segment(x=0,xend= 100000,y=0,yend=0,size=1,color='white')+
  geom_line(data = df_geneCDSlength,mapping = aes(x=Gene,y= 7*CDSLength/1000,group = 1),color = "gray")+
  annotate("text",x=-Inf,y= 0 ,hjust=-0.3,vjust=4,label="MAF <= 5%",size=3)+
  annotate("text",x=-Inf,y= 0,hjust=-0.3,vjust=-4,label="MAF > 5%",size=3)+
  scale_y_continuous(limit = c(-200,120), breaks = seq(-200,100,50),labels = c(200,150,100,50,0,50,100)
                     ,sec.axis = sec_axis(~./7, name="CDS length (kb)",breaks = seq(0,15,5))
                     )+
  xlab("Gene rank (by number of rare coding SNVs)")+ylab("Number of variants")+
  scale_fill_manual(values = colB,name="")+
  scale_x_discrete(limits = df_sample$Transcript, breaks = c("TraesCS1A02G169600.1","TraesCS1A02G040600.1","TraesCS1A02G091500.1","TraesCS1B02G160800.1")
                   ,labels = c(1,25,50,75)
                   )+
  # theme(axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x = element_blank())+
  theme(panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 9))
p

# ggsave(p,filename = "figs/Fig2/misc/Gene_MAF_LoadGroup.pdf",height = 3,width = 6)
ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 3.6)

1.8.2 稀有变异数分布

df_filter %>% 
  ggplot(aes(x=.$RareVariantsCount))+
  geom_histogram()
## Warning: Use of `.$RareVariantsCount` is discouraged. Use `RareVariantsCount`
## instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### 非同义突变数目分布

df_highImpact <- dftotal %>% 
  filter(Impact_VEP=="HIGH") %>% 
  mutate(LoadGroup="HighImpact")

df1 <- dftotal_withAncestral %>% 
  select(-IfRefisAnc,CommonOrRare) %>% 
  bind_rows(df_highImpact) %>% 
  # filter(LoadGroup=="Nonsynonymous") %>% 
  group_by(LoadGroup) %>% 
  count(Transcript)
  # mutate(bin=cut_width(n,width = 3,boundary = 0)) %>% 
  # group_by(LoadGroup) %>% 
  # count(bin) %>% 
  # mutate(fre=n/sum(n))

colB <- c("#d5311c","darkred","#e69f00","#004680")
p <- df1 %>% 
  ggplot(aes(x=n,fill=LoadGroup,color=LoadGroup))+
  # geom_histogram(aes(y=..ndensity..), alpha=0.5,bins = 15)+
  geom_histogram(alpha=0.7,bins = 15)+
  # geom_density(alpha=.2, fill="#FF6666",)+
  # ggplot(aes(x=bin,y=fre,group=LoadGroup,fill=LoadGroup))+
  # geom_bar(stat = 'identity',position = "dodge")+
  labs(x="Number of derived SNPs in one gene", y="Gene number")+
  scale_fill_manual(values = colB)+
  scale_color_manual(values = colB)+
  facet_wrap(.~LoadGroup, scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(size=0.3, colour = "black"),
    legend.position = 'none',
    text = element_text(size = 12))

p

ggsave(p,filename = "~/Documents/test.pdf",height = 4,width = 6)

数目统计

## 每个基因含有的变异数范围
df <- dftotal %>% filter(Region=="CDS",Variant_type=="SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>% 
  mutate(Derived_SIFT = ifelse(is.na(Derived_SIFT),100,Derived_SIFT)) %>% 
  mutate(Gerp = ifelse(is.na(Gerp),-100,Gerp)) %>% 
  mutate(LoadGroup = ifelse(Variant_type=="NONSYNONYMOUS",ifelse(Derived_SIFT<0.05&Gerp>=1,"Deleterious","Nonsynonymous"),"Synonymous")) %>% 
  filter(!Ancestral == "NA") %>% 
  filter(Ancestral== Ref|Ancestral==Alt) %>% 
  mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  mutate(CommonOrRare = if_else(Maf <= 0.05, "Rare","Common")) %>%  ### 特别注意,这里根据MAF进行分组,按照维基百科的定义
  count(Transcript) 

## 1-199 个变异

## 每个基因含有的common和rare变异的分布范围
df <- dftotal %>% filter(Region=="CDS",Variant_type=="SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>% 
  mutate(Derived_SIFT = ifelse(is.na(Derived_SIFT),100,Derived_SIFT)) %>% 
  mutate(Gerp = ifelse(is.na(Gerp),-100,Gerp)) %>% 
  mutate(LoadGroup = ifelse(Variant_type=="NONSYNONYMOUS",ifelse(Derived_SIFT<0.05&Gerp>=1,"Deleterious","Nonsynonymous"),"Synonymous")) %>% 
  filter(!Ancestral == "NA") %>% 
  filter(Ancestral== Ref|Ancestral==Alt) %>% 
  mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  mutate(CommonOrRare = if_else(Maf <= 0.05, "Rare","Common")) %>%  ### 特别注意,这里根据MAF进行分组,按照维基百科的定义
  group_by(CommonOrRare) %>% 
  count(Transcript) %>% 
  summarise(maxNum = max(n),minNum = min(n))



## 每个基因含有的不同loadGroup变异的分布范围
df <- dftotal %>% filter(Region=="CDS",Variant_type == "SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>% 
  mutate(Derived_SIFT = ifelse(is.na(Derived_SIFT),100,Derived_SIFT)) %>% 
  mutate(Gerp = ifelse(is.na(Gerp),-100,Gerp)) %>% 
  mutate(LoadGroup = ifelse(Variant_type=="NONSYNONYMOUS",ifelse(Derived_SIFT<0.05&Gerp>=1,"Deleterious","Nonsynonymous"),"Synonymous")) %>% 
  filter(!Ancestral == "NA") %>% 
  filter(Ancestral== Ref|Ancestral==Alt) %>% 
  mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  mutate(CommonOrRare = if_else(Maf <= 0.05, "Rare","Common")) %>%  ### 特别注意,这里根据MAF进行分组,按照维基百科的定义
  group_by(LoadGroup) %>% 
  count(Transcript) %>% 
  summarise(maxNum = max(n),minNum = min(n))

1.8.2.1 ======= Sample size==============

1.9 正文用图:CDF (scale_x)

pfun_CDF_scale_x <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
### 找到每种分类下的最大值
aoFindMaxLine <- function(dfinput){
  maxtaxanum = max(dfinput$TaxaNum)
  dfout <- dfinput %>% filter(TaxaNum==maxtaxanum)
  return(dfout)
}

### 发现每个分类的最大值 "Group"         "TaxaNum"       "ObservedCount" "Loop"    
dfpoint <- df %>% group_by(Group) %>% 
  group_modify(~{.x %>% aoFindMaxLine()}) %>% ### 将每个分类的最大值合并入文件中
  select(Group,TotalNum = ObservedCount) %>% 
  distinct(.,Group,.keep_all = T) ###每种分类有多个循环loop,所以有重复,需要去除

p <- df %>% 
  left_join(.,dfpoint,by = "Group") %>% 
  filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  mutate(CDF = ObservedCount/TotalNum) %>% 
  ggplot(aes(x=TaxaNum,y=CDF))+
  stat_summary(geom = "errorbar"
               ,width = 0.1
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  scale_color_manual(values = colB)+
  # labs(x="Sample size",y="CDF")+
  labs(x="Sample size",y="Percent observed")+
  scale_x_log10(limits = c(1,maxSampleSize))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(), 
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 9))


p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_CDF_scale_x.pdf")),height = 2.22,width = 1.8)
}

1.9.1 p0函数调用

df <- read_tsv("data/010_sampleSize2VariantsDiscovery/ABsub.txt")
## Rows: 1800 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pfun_CDF_scale_x(df,1026,"ABsub")

### -----------------------------------
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/Dsub.txt")
## Rows: 1500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pfun_CDF_scale_x(df,1026,"Dsub")

1.9.1.1 ———————

1.10 分析- LoadGroup

1.10.1 *methd1:count (no scale,scale_x,scale_xy)

pfun_count <- function(df,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")

p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=ObservedCount))+
  stat_summary(geom = "errorbar"
               # ,width = 8   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  scale_color_manual(values = colB)+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_count.pdf")),height = 3,width = 4)
}

pfun_count_scale_x <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  p <- df %>% filter(Group %in% group1) %>% 
    mutate(Group=factor(Group,levels = group1)) %>%
    ggplot(aes(x=TaxaNum,y=ObservedCount))+
    stat_summary(geom = "errorbar"
               # ,width = 8   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
    stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
    stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  # scale_color_manual(values = colB)+
  scale_x_log10(limits = c(1,maxSampleSize))+
  scale_color_manual(values = colB)+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 9))

p
ggsave(p,filename = file.path("figs/Fig_sampleSize2VariantsDiscovery/misc2",str_c(outname,"_count_scale_x.pdf")),height = 2.22,width = 3.6)
}

pfun_count_scale_xy <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=ObservedCount))+
  stat_summary(geom = "errorbar"
               # ,width = 8   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  # scale_color_manual(values = colB)+
  scale_color_manual(values = colB)+
  scale_x_log10(limits = c(1,maxSampleSize))+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  scale_y_log10()+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_count_scale_xy.pdf")),height = 3,width = 4)
}

1.10.2 p0 函数调用

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/ABsub.txt")

df <- read_tsv("data/010_sampleSize2VariantsDiscovery/ABsub.txt")
## Rows: 1800 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_count(df,"ABsub")
pfun_count_scale_x(df,1026,"ABsub")
# pfun_count_scale_xy(df,1026,"ABsub")

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/Dsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/Dsub.txt")
## Rows: 1500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_count(df,"Dsub")
pfun_count_scale_x(df,1026,"Dsub")
# pfun_count_scale_xy(df,1026,"Dsub")

1.10.3 *methd2:CDF (no scale,scale_x,scale_xy)

pfun_CDF <- function(df,outname){

  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
### 找到每种分类下的最大值
aoFindMaxLine <- function(dfinput){
  maxtaxanum = max(dfinput$TaxaNum)
  dfout <- dfinput %>% filter(TaxaNum==maxtaxanum)
  return(dfout)
}

### 发现每个分类的最大值 "Group"         "TaxaNum"       "ObservedCount" "Loop"    
dfpoint <- df %>% group_by(Group) %>% 
  group_modify(~{.x %>% aoFindMaxLine()}) %>% ### 将每个分类的最大值合并入文件中
  select(Group,TotalNum = ObservedCount) %>% 
  distinct(.,Group,.keep_all = T) ###每种分类有多个循环loop,所以有重复,需要去除

p <- df %>% 
  left_join(.,dfpoint,by = "Group") %>% 
  filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  mutate(CDF = ObservedCount/TotalNum) %>% 
  ggplot(aes(x=TaxaNum,y=CDF))+
  stat_summary(geom = "errorbar"
               # ,width = 0.1
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  scale_color_manual(values = colB)+
  # labs(x="Sample size",y="CDF")+
  labs(x="Sample size",y="Percent observed")+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(), 
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))


p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_CDF.pdf")),height = 3,width = 4)
}

pfun_CDF_scale_x <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
### 找到每种分类下的最大值
aoFindMaxLine <- function(dfinput){
  maxtaxanum = max(dfinput$TaxaNum)
  dfout <- dfinput %>% filter(TaxaNum==maxtaxanum)
  return(dfout)
}

### 发现每个分类的最大值 "Group"         "TaxaNum"       "ObservedCount" "Loop"    
dfpoint <- df %>% group_by(Group) %>% 
  group_modify(~{.x %>% aoFindMaxLine()}) %>% ### 将每个分类的最大值合并入文件中
  select(Group,TotalNum = ObservedCount) %>% 
  distinct(.,Group,.keep_all = T) ###每种分类有多个循环loop,所以有重复,需要去除

p <- df %>% 
  left_join(.,dfpoint,by = "Group") %>% 
  filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  mutate(CDF = ObservedCount/TotalNum) %>% 
  ggplot(aes(x=TaxaNum,y=CDF))+
  stat_summary(geom = "errorbar"
               ,width = 0.1
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  scale_color_manual(values = colB)+
  # labs(x="Sample size",y="CDF")+
  labs(x="Sample size",y="Percent observed")+
  scale_x_log10(limits = c(1,maxSampleSize))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(), 
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 16))


p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_CDF_scale_x.pdf")),height = 3,width = 4)
}

pfun_CDF_scale_xy <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
### 找到每种分类下的最大值
aoFindMaxLine <- function(dfinput){
  maxtaxanum = max(dfinput$TaxaNum)
  dfout <- dfinput %>% filter(TaxaNum==maxtaxanum)
  return(dfout)
}

### 发现每个分类的最大值 "Group"         "TaxaNum"       "ObservedCount" "Loop"    
dfpoint <- df %>% group_by(Group) %>% 
  group_modify(~{.x %>% aoFindMaxLine()}) %>% ### 将每个分类的最大值合并入文件中
  select(Group,TotalNum = ObservedCount) %>% 
  distinct(.,Group,.keep_all = T) ###每种分类有多个循环loop,所以有重复,需要去除

p <- df %>% 
  left_join(.,dfpoint,by = "Group") %>% 
  filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  mutate(CDF = ObservedCount/TotalNum) %>% 
  ggplot(aes(x=TaxaNum,y=CDF))+
  stat_summary(geom = "errorbar"
               ,width = 0.1
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Number of sites observed\nto be variable")+
  scale_color_manual(values = colB)+
  # labs(x="Sample size",y="CDF")+
  labs(x="Sample size",y="Percent observed")+
  scale_x_log10(limits = c(1,maxSampleSize))+
  scale_y_log10()+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(), 
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))


p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_CDF_scale_xy.pdf")),height = 3,width = 4)
}

1.10.4 p0 函数调用

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/ABsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/ABsub.txt")
## Rows: 1800 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pfun_CDF(df,"ABsub")
pfun_CDF_scale_x(df,1026,"ABsub")
pfun_CDF_scale_xy(df,1026,"ABsub")

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/Dsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/Dsub.txt")
## Rows: 1500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pfun_CDF(df,"Dsub")
pfun_CDF_scale_x(df,1026,"Dsub")
pfun_CDF_scale_xy(df,1026,"Dsub")

1.10.5 *methd3:Increasement (no scale,scale_x,scale_xy)

pfun_Increasement <- function(df,maxSampleSize ,outname){ 

  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")

  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)
  
  df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)

p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=Increasement))+
  stat_summary(geom = "errorbar"
               ,width = 20   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="New sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement.pdf")),height = 3,width = 4)
}

pfun_Increasement_scale_x <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)

    df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)
    
p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=Increasement))+
  stat_summary(geom = "errorbar"
               ,width = 40 
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="New sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_x_log10(limits = c(100,maxSampleSize))+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 9))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement_scale_x.pdf")),height = 2.22,width = 3.6)
}


pfun_Increasement_scale_xy <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)

    df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)
    
p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=Increasement))+
  stat_summary(geom = "errorbar"
               ,width = 40   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="New sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_x_log10(limits = c(100,maxSampleSize))+
  scale_y_log10()+
  # scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement_scale_xy.pdf")),height = 3,width = 4)
}

1.10.6 p0 函数调用

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/ABsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/ABsub.txt")
## Rows: 1800 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_Increasement(df,1026,"ABsub")
pfun_Increasement_scale_x(df,1026,"ABsub")
# pfun_Increasement_scale_xy(df,1026,"ABsub")

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/Dsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/Dsub.txt")
## Rows: 1500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_Increasement(df,850,"Dsub")
pfun_Increasement_scale_x(df,850,"Dsub")
# pfun_Increasement_scale_xy(df,850,"Dsub")

1.10.7 *methd4:Increasement CDF (no scale,scale_x,scale_xy)

pfun_Increasement_CDF <- function(df,maxSampleSize ,outname){ 

  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)
  
  df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)

p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=percent))+
  stat_summary(geom = "errorbar"
               ,width = 50   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Percent of new sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement_CDF.pdf")),height = 3,width = 4)
}

pfun_Increasement_CDF_scale_x <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)

    df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)
    
p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=percent))+
  stat_summary(geom = "errorbar"
               # ,width = 50   
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Percent of new sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_x_log10(limits = c(100,maxSampleSize))+
  # scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 9))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement_CDF_scale_x.pdf")),height = 2.22,width = 3.6)
}


pfun_Increasement_CDF_scale_xy <- function(df,maxSampleSize,outname){
  group1 <- c("Deleterious","Nonsynonymous","Synonymous")
  colB <- c("#d5311c","#e69f00","#004680")
  ## 建立2个数据框,第一个是去除taxaNum为1的数据框 df_taxaNumNo1 ;第二个是去除taxaNum为最大值的数据框 df_taxaNumNomax
  df_taxaNumNo1 <- df %>% filter(TaxaNum != 1) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNomax <- df %>% filter(TaxaNum != maxSampleSize) %>% arrange(TaxaNum,Loop,Group)
  df_taxaNumNo1$Increasement <- df_taxaNumNo1$ObservedCount - df_taxaNumNomax$ObservedCount
  dftotalnum <- df %>% filter(TaxaNum == maxSampleSize,Loop==1) %>% select(Group, ObservedCount)

    df <- df_taxaNumNo1 %>%left_join(dftotalnum,by="Group") %>% 
  mutate(percent = Increasement/ObservedCount.y)
    
p <- df %>% filter(Group %in% group1) %>% 
  mutate(Group=factor(Group,levels = group1)) %>%
  ggplot(aes(x=TaxaNum,y=percent))+
  stat_summary(geom = "errorbar"
               ,width = 50  
               ,fun.min = min
               ,fun.max = max
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
  stat_summary(fun = "mean"
               ,geom = "point"
               ,aes(color = Group)
               # ,alpha = 0.5
               )+ 
   stat_summary(fun = "mean"
               ,geom = "line"
               ,aes(color = Group)
               # ,alpha = 0.9
               )+
  labs(x="Sample size",y="Percent of new sites observed\nto be variable per additional sample")+
  scale_color_manual(values = colB)+
  scale_x_log10(limits = c(100,maxSampleSize))+
  scale_y_log10()+
  # scale_y_continuous(labels = scales::comma_format(big.mark = ','))+
  # facet_wrap(~Group,ncol = 3,scales = "free")+
  theme_classic()+
  theme(panel.grid = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(size=0.4, colour = "black"),
      legend.position = 'none',
      text = element_text(size = 12))

p
ggsave(p,filename = file.path("~/Documents/",str_c(outname,"_Increasement_CDF_scale_xy.pdf")),height = 3,width = 4)
}

1.10.8 p0 函数调用

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/ABsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/ABsub.txt")
## Rows: 1800 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_Increasement_CDF(df,1026,"ABsub")
pfun_Increasement_CDF_scale_x(df,1026,"ABsub")
# pfun_Increasement_CDF_scale_xy(df,1026,"ABsub")

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/010_sampleSize_to_variantsDiscovery/001_out_bySub/Dsub.txt")
df <- read_tsv("data/010_sampleSize2VariantsDiscovery/Dsub.txt")
## Rows: 1500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Group
## dbl (3): TaxaNum, ObservedCount, Loop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pfun_Increasement_CDF(df,850,"Dsub")
pfun_Increasement_CDF_scale_x(df,850,"Dsub")
# pfun_Increasement_CDF_scale_xy(df,850,"Dsub")

1.10.8.1 ======================

1.11 Allele age

1.11.1 A:SNP count with allele age

col <- c("#004680","#e69f00","#d5311c")
## probably damaging [0.957,1]
## possibly damaging [0.453,0.956]
dfAlleleAge <- dftotal %>% 
  filter(Region=="CDS",Variant_type=="SYNONYMOUS"|Variant_type=="NONSYNONYMOUS")%>%
  filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt) %>%
  mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>%
  filter(!is.na(AlleleAge_J), !is.na(AlleleAge_M), !is.na(AlleleAge_R)) %>%
  mutate(LoadGroup=factor(LoadGroup, levels = c("Synonymous","Nonsynonymous","Deleterious")))

df_db_summary <- dfAlleleAge %>% 
  group_by(LoadGroup,Sub) %>% 
  summarise(n=n())
## `summarise()` has grouped output by 'LoadGroup'. You can override using the `.groups` argument.
q <- dfAlleleAge %>% 
  ggplot(aes(x=Sub, fill=LoadGroup))+
  geom_bar(position = position_dodge(width = 0.9))+
  geom_text(data = df_db_summary, aes(label=n, y=n), 
            position = position_dodge(width = 0.9), size=2.5)+
  scale_fill_manual(values = col)+
  labs(y="The num of SNP having allele age")+
  theme_classic()+
  theme(legend.title = element_blank(),
        legend.position = "none",
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        axis.title.x = element_blank(),
        axis.line = element_blank(),
        # text = element_text(size = 14),
        panel.border = element_rect(colour = "black",fill = "transparent"))
q

1.11.2 B: allele age distribution

q <- dfAlleleAge %>% 
  mutate(Region=factor(Region, levels = c("UTR_5","Intron","CDS","UTR_3"))) %>% 
  ggplot(aes(x=AlleleAge_J, fill=LoadGroup))+
  geom_histogram(aes(y=..count..), color="white")+
  facet_grid(Sub~LoadGroup, scales = "free")+
  # scale_color_brewer(palette = "Set2")+
  # scale_fill_brewer(palette = "Set2")+
  scale_fill_manual(values = col)+
  scale_color_manual(values = col)+
  scale_x_continuous(trans = "log10",
                     breaks = scales::trans_breaks("log10", function(x) 10^x),
                     labels = scales::trans_format("log10",   scales::math_format(10^.x)))+
  labs(x="Mutation age (Years)")+
  theme_classic()+
  theme(legend.title = element_blank(),
        legend.position = "none",
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        # axis.title.x = element_blank(),
        axis.line = element_blank(),
        text = element_text(size = 16),
        panel.border = element_rect(colour = "black",fill = "transparent"))
q
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

1.11.3 C: allele age vs DAF

q <- dfAlleleAge %>% 
  group_by(Sub,LoadGroup) %>% 
  slice_sample(prop = 0.2) %>%
  ggplot(aes(x=AlleleAge_J, y=DAF,color=LoadGroup, fill=LoadGroup))+
  geom_point(alpha=0.2)+
  facet_grid(Sub~LoadGroup)+
  scale_color_manual(values = col)+
  scale_fill_manual(values = col)+
  scale_x_continuous(trans = "log10",
                     # breaks = scales::trans_breaks("log10", function(x) 10^x),
                     labels = scales::trans_format("log10",   scales::math_format(10^.x)))+
  labs(x="Mutation age (Years)",y="Derived allele frequency")+
  theme_classic()+
  theme(legend.title = element_blank(),
        legend.position = "none",
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        # axis.title.x = element_blank(),
        axis.line = element_blank(),
        text = element_text(size = 16),
        panel.border = element_rect(colour = "black",fill = "transparent"))
q

ggsave(plot = q, "~/Documents/temp.pdf", width = 6, height = 4)
q <- dfAlleleAge %>% 
  group_by(Sub,LoadGroup) %>% 
  slice_sample(prop = 0.2) %>%
  ggplot(aes(x=DAF, y=AlleleAge_J,color=LoadGroup, fill=LoadGroup))+
  geom_point(alpha=0.2)+
  facet_grid(Sub~LoadGroup)+
  scale_color_manual(values = col)+
  scale_fill_manual(values = col)+
  scale_y_continuous(trans = "log10",
                     # breaks = scales::trans_breaks("log10", function(x) 10^x),
                     labels = scales::trans_format("log10",scales::math_format(10^.y)))+
  # labs(x="Mutation age (Years)",y="Derived allele frequency")+
  theme_classic()+
  theme(legend.title = element_blank(),
        legend.position = "none",
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        # axis.title.x = element_blank(),
        axis.line = element_blank(),
        text = element_text(size = 16),
        panel.border = element_rect(colour = "black",fill = "transparent"))
q

ggsave(plot = q, "~/Documents/temp.pdf", width = 6, height = 4)

1.11.4 ** 密度图allele age vs DAF (2D plot)

p <- dfAlleleAge %>% 
  group_by(Sub,LoadGroup) %>% 
  slice_sample(prop = 0.5) %>%
  ggplot(aes(x=AlleleAge_J, y=DAF))+
  # stat_density_2d(aes(x = AlleleAge_J, y = DAF, fill = stat(nlevel)),
  #                 geom = "polygon", n = 500, bins = 10, contour = TRUE) +
  # geom_bin2d(aes(x = AlleleAge_J, y = DAF),bins = 10) +
  ### raster 方法
  stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE) +
  scale_fill_distiller(palette=4, direction=1)+
  # scale_fill_fermenter(palette=4, direction=1)+
  # scale_fill_gradient()+
  ### hex 方法
  # geom_hex(bins = 50) +
  # scale_fill_gradient(
  #   low = "#9696F2", 
  #   high = "#0A0A3D")+

  # geom_point(alpha=0.5)+
  facet_grid(.~LoadGroup)+
  # scale_color_manual(values = col)+
  # scale_fill_manual(values = col)+
  # scale_fill_viridis_c(option = "A")+
  # scale_fill_continuous(type="viridis")+
  scale_x_continuous(trans = "log10",
                     expand = c(0, 0),
                     # breaks = scales::trans_breaks("log10", function(x) 10^x),
                     labels = scales::trans_format("log10",   scales::math_format(10^.x)))+
  scale_y_continuous(expand = c(0, 0))+
  labs(x="Derived allele age (Years)",y="Derived allele frequency")+
  theme_classic()+
  theme(legend.title = element_blank(),
        # legend.position = "none",
        legend.background = element_rect(fill="transparent"),
        strip.background = element_blank(),
        axis.line = element_blank(),
        text = element_text(size = 9),
        panel.border = element_rect(colour = "black",fill = "transparent"))
p

ggsave(p, filename = "~/Documents/test.pdf", height = 2.22, width = 3.6)

1.11.5 D: Del proportion on diff allele age

dabins <- c(0,1,2,4,8,16,32, 64, 128)
# ### 分亚基因组
# df <- dfAlleleAge %>% 
#   mutate(bin=cut(AlleleAge_J/1000,breaks=dabins)) %>%
#   filter(!is.na(LoadGroup), !is.na(bin)) %>% 
#   group_by(Sub,bin) %>% 
#   count(LoadGroup) %>% 
#   mutate(fre=n/sum(n)) 

# q <- df %>% 
#   # filter(Sub=="A") %>% 
#   ggplot(aes(x=bin, y=fre, color=LoadGroup,group=LoadGroup))+
#   geom_point()+
#   geom_line()+
#   facet_grid(.~Sub)+
#   scale_color_manual(values = col)+
#   scale_x_discrete(label=dabins)+
#   labs(x="Derived allele age (Ka)",y="Deleterious SNPs (%)") +
#   theme_classic()+
#   theme(legend.title = element_blank(),
#         legend.position = "none",
#         legend.background = element_rect(fill="transparent"),
#         strip.background = element_blank(),
#         axis.line = element_blank(),
#         panel.border = element_rect(colour = "black",fill = "transparent"))
# q
# 
# ggsave(plot = q, "~/Documents/temp.pdf", width = 5, height = 3)

### 不分亚基因组

df <- dfAlleleAge %>% 
  mutate(bin=cut(AlleleAge_J/1000,breaks=dabins)) %>%
  filter(!is.na(LoadGroup), !is.na(bin)) %>% 
  group_by(bin) %>% 
  count(LoadGroup) %>% 
  mutate(fre=n/sum(n)) 

p <- df %>% 
  ggplot(aes(x=bin, y=fre, color=LoadGroup,group=LoadGroup))+
  geom_point()+
  geom_line()+
  scale_color_manual(values = col)+
  scale_x_discrete(label=dabins)+
  labs(x="Derived allele age (Ka)",y="Deleterious SNPs (%)") +
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 9))

p

ggsave(plot = p, "~/Documents/test.pdf", height = 2.22,width = 1.8)

1.11.5.1 =======================

1.12 1.Gene expression vs strongly burden

df_burden1 <- read_tsv("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneBurden.averageIndividualBurden.txt.gz")
## Rows: 156026 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): geneName, SlightlyOrStrongly
## dbl (4): n, mean_burden, sd_burden, rsd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_expression <- read_tsv("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneExpression.averageTPM.txt.gz")
## Rows: 108061 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Subgenome
## dbl (7): TPM_mean, TPM_sd, TPM_max, TPM_min, TPM_RSD, TPM_median, TPM_sum
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_burden1_expression <- df_burden1 %>% 
  left_join(y = df_expression,by = c("geneName"="Gene")) %>% 
  filter(!is.na(Subgenome))

temp <- df_burden1_expression %>% 
  # filter(SlightlyOrStrongly=="StronglyBurden") %>%
  mutate(TPM_mean_log = log2(TPM_mean)) %>% 
  mutate(bin=cut_width(TPM_mean_log,width = 1,boundary = 0)) %>% 
  group_by(SlightlyOrStrongly, Subgenome,bin) %>% 
  count() %>%   
  filter(n > 20, !is.na(bin))

colN1 <- brewer.pal(9,"Reds")[c(7)]
colA <- colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(13)
colC <- rev(colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(8))
colN2 <- brewer.pal(9,"Reds")[c(9)]

colB <- c(colN1,colA,colC,colN2)
# colB <- colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(40)
# xLabel <- expression(2^{-12},"",2^{-8},"",2^{-4},"",2^{0},"",2^{4},"",2^{8},"")
# xLabel <- expression("",2^{-12},"","",2^{-9},"","",2^{-6},"","",2^{-3},"","",2^{0},"","",2^{3},"","",2^{6},"","",2^{9})
xLabel <- expression("",2^{-12},"","",2^{-9},"","",2^{-6},"","",2^{-3},"","",2^{0},"","",2^{3},"","",2^{6},"","",2^{9})
p <- df_burden1_expression %>% 
  filter(SlightlyOrStrongly=="StronglyBurden") %>%
  mutate(TPM_mean_log = log2(TPM_mean)) %>% 
  mutate(bin=cut_width(TPM_mean_log,width = 1,boundary = 0)) %>% 
  filter(bin %in% temp$bin) %>%
  ggplot(aes(x=bin, y=mean_burden)) +
# stat_boxplot(geom = "errorbar",width=0.05,position = position_dodge(1))+ # add error bar
# geom_boxplot(aes(fill=bin),outlier.shape = NA) +
# geom_point(position = position_jitter(width = 0.2), size=0.1)+
# stat_summary(fun=median, geom="point", color="blue",
#              position = position_dodge(1),
#              size=0.1)+
  stat_summary(aes(color=bin),fun.data = mean_se,size=0.2)+
  scale_fill_manual(values = colB)+
  # facet_grid(SlightlyOrStrongly~.,scales = "free")+
  labs(x="TPM", y= "Gene burden")+
  # scale_y_continuous(trans = "log2",
  #                    breaks = trans_breaks("log2",function(x) 2^x),
  #                    labels = trans_format("log2",math_format(2^.x)))+
  scale_x_discrete(breaks=c("(-12,-11]","(-9,-8]","(-6,-5]","(-3,-2]","(0,1]","(3,4]","(6,7]","(9,10]"),
                   labels=expression(2^{-12},2^{-9},2^{-6},2^{-3},2^{0},2^{3},2^{6},2^{9}))+
  # scale_x_discrete(labels=xLabel)+
  scale_color_manual(values = colB)+
  coord_cartesian(ylim = c(0,0.04))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 9))

p
## Warning: Removed 49751 rows containing non-finite values (stat_summary).

ggsave(plot = p, "~/Documents//Burden1_vs_TPM.pdf",height = 2,width = 2)
## Warning: Removed 49751 rows containing non-finite values (stat_summary).
dff <- df_burden1_expression %>% 
  filter(SlightlyOrStrongly=="StronglyBurden") %>%
  mutate(TPM_mean_log = log2(TPM_mean)) %>% 
  filter(!is.na(mean_burden), is.finite(mean_burden),mean_burden!=0)

1.13 2.Gene expression vs slightly burden

df_burden1 <- read_tsv("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneBurden.averageIndividualBurden.txt.gz")
## Rows: 156026 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): geneName, SlightlyOrStrongly
## dbl (4): n, mean_burden, sd_burden, rsd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_expression <- read_tsv("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneExpression.averageTPM.txt.gz")
## Rows: 108061 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Subgenome
## dbl (7): TPM_mean, TPM_sd, TPM_max, TPM_min, TPM_RSD, TPM_median, TPM_sum
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_burden1_expression <- df_burden1 %>% 
  left_join(y = df_expression,by = c("geneName"="Gene")) %>% 
  filter(!is.na(Subgenome))

temp <- df_burden1_expression %>% 
  # filter(SlightlyOrStrongly=="StronglyBurden") %>%
  mutate(TPM_mean_log = log2(TPM_mean)) %>% 
  mutate(bin=cut_width(TPM_mean_log,width = 1,boundary = 0)) %>% 
  group_by(SlightlyOrStrongly, Subgenome,bin) %>% 
  count() %>%   
  filter(n > 20, !is.na(bin))

colN1 <- brewer.pal(9,"Reds")[c(7)]
colA <- colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(13)
colC <- rev(colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(8))
colN2 <- brewer.pal(9,"Reds")[c(9)]

colB <- c(colN1,colA,colC,colN2)
# colB <- colorRampPalette(brewer.pal(9,"Reds")[c(2,4,5,6,7,8,9)])(40)
# xLabel <- expression(2^{-12},"",2^{-8},"",2^{-4},"",2^{0},"",2^{4},"",2^{8},"")
# xLabel <- expression("",2^{-12},"","",2^{-9},"","",2^{-6},"","",2^{-3},"","",2^{0},"","",2^{3},"","",2^{6},"","",2^{9})
xLabel <- expression("",2^{-12},"","",2^{-9},"","",2^{-6},"","",2^{-3},"","",2^{0},"","",2^{3},"","",2^{6},"","",2^{9})
p <- df_burden1_expression %>% 
  filter(SlightlyOrStrongly=="SlightlyBurden") %>%
  mutate(TPM_mean_log = log2(TPM_mean)) %>% 
  mutate(bin=cut_width(TPM_mean_log,width = 1,boundary = 0)) %>% 
  filter(bin %in% temp$bin) %>%
  ggplot(aes(x=bin, y=mean_burden)) +
# stat_boxplot(geom = "errorbar",width=0.05,position = position_dodge(1))+ # add error bar
# geom_boxplot(aes(fill=bin),outlier.shape = NA) +
# geom_point(position = position_jitter(width = 0.2), size=0.1)+
# stat_summary(fun=median, geom="point", color="blue",
#              position = position_dodge(1),
#              size=0.1)+
  stat_summary(aes(color=bin),fun.data = mean_se,size=0.2)+
  scale_fill_manual(values = colB)+
  # facet_grid(SlightlyOrStrongly~.,scales = "free")+
  labs(x="TPM", y= "Slightly gene burden")+
  # scale_y_continuous(trans = "log2",
  #                    breaks = trans_breaks("log2",function(x) 2^x),
  #                    labels = trans_format("log2",math_format(2^.x)))+
  scale_x_discrete(breaks=c("(-12,-11]","(-9,-8]","(-6,-5]","(-3,-2]","(0,1]","(3,4]","(6,7]","(9,10]"),
                   labels=expression(2^{-12},2^{-9},2^{-6},2^{-3},2^{0},2^{3},2^{6},2^{9}))+
  scale_x_discrete(labels=xLabel)+
  scale_color_manual(values = colB)+
  # coord_cartesian(ylim = c(0,0.04))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 9))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
p
## Warning: Removed 61433 rows containing non-finite values (stat_summary).

ggsave(plot = p, "~/Documents//Burden1_vs_TPM.pdf",height = 2,width = 2)
## Warning: Removed 61433 rows containing non-finite values (stat_summary).

1.14 3.Tissue expression VS mutation burden

  • caption: Deleterious mutation of genes expressed across tissues. conclusion: Deleterious mutations are depleted in genes expressed in grain and anther.
### Step1: 将29万个基因在35个组织中的表达量过滤,只保留高置信度的最长转录本;将Load的结果也合并到HC数据库中
### 第一个数据框
df_expression <- read_tsv("data/011_geneExpression/expressionByTissue.txt.gz")

### 第二个数据框
dataFile <- c("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneBurden.averageIndividualBurden.txt.gz")

df_load <- read_tsv(dataFile) %>%
  rename(Gene = geneName)

dataFile <- c("data/011_geneExpression/geneHC.txt.gz")
df_geneHC <- read_tsv(dataFile) %>%
  rename(Transcripts = Longest_transcript) %>%
  left_join(.,df_expression,by="Transcripts")  ###合并第1个数据框

## expression 文件与load文件合并
  df_expression_load <- df_load %>%
    left_join(.,df_geneHC,by="Gene")  ###合并第2个数据框

# write.table(df_geneHC,file="~/Documents/test.txt",quote=F,col.name=T,row.names=F,sep = "\t")
  
  
  ### Step2: 转换表格,将组织表达 -> 长表
df_expression_load_long <- df_expression_load %>%
  pivot_longer(.,cols = "Internode #2":"third leaf sheath", names_to = "Tissue",values_to = "TPM") %>%
  filter(TPM < 0.5)

### 计算load和组织的关系
dfstatistic <- df_expression_load_long %>%
  # pivot_longer(.,cols = FreSyn:RatioHGDelVSSyn,names_to = "LoadType",values_to = "LoadValue") %>%
  filter(!is.na(mean_burden), is.finite(mean_burden)) %>%
  group_by(SlightlyOrStrongly,Tissue) %>% ### 计算不同分类下,不同组织load的平均值 标准差
  summarise(ave = mean(mean_burden), sd = sd(mean_burden), n= n(), se = sd/sqrt(n))

# write.table(dfstatistic,file="~/Documents/barplot.txt",quote=F,col.name=T,row.names=F,sep = "\t")

接上- 开始画图 ### strongly burden by individual

### 将多个组织的命名方式都添加进去,便于排序画图使用
dataFile <- c("data/011_geneExpression/TissueHashMap.txt")
dfTissueHashMap <- read_tsv(dataFile) %>%
  rename(Tissue = Tissue_inOri)
## Rows: 35 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Tissue_inRHeader, Tissue_inOri, Intermediate_tissue, High_level_tissue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
AoCase <- "StronglyBurden"

df <- dfstatistic %>%
  left_join(.,dfTissueHashMap,by="Tissue") %>%
  filter(SlightlyOrStrongly == AoCase) %>%
  mutate(High_level_tissue=factor(High_level_tissue,levels = c("roots","leaves/shoots","spike","grain"))) %>%
  arrange(High_level_tissue,-ave) ## 注意排序!!!

### 画图的时候根据数据框的顺序画出来
### 35 个组织的顺序定义
df$Tissue <- factor(df$Tissue,levels = df$Tissue)

colB <- c("#9a784c","#749b64","#e6b700","#b3b563") ## 4个组织的颜色

p <- df %>%
  ggplot(aes(x= Tissue, y=ave,fill = High_level_tissue)) +
  geom_bar(stat="identity", position=position_dodge()) +
  geom_errorbar(aes(ymin=ave-se, ymax=ave+se), width=.1,
                # color="gray",
                 position=position_dodge(.9))+
  # labs(x="",y= "Strongly burden in CDS")+
  labs(x="",y= "Mutation burden")+
  geom_hline(yintercept = 0.0095, color = "gray",linetype="dashed")+
  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  scale_fill_manual(values = colB)+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 8))

p

ggsave(p,filename = "~/Documents/stronglyBurden_in_Tissue.pdf",height = 3,width = 4.2)

### 查看每个组织有多少个基因含有burden值
q <- df %>%
  ggplot(aes(x= Tissue, y=n,fill = High_level_tissue)) +
   geom_bar(stat="identity", position=position_dodge()) +
  labs(x="",y= "Gene number with strongly burden")+
  geom_hline(yintercept = 0.0318, color = "gray",linetype="dashed")+
  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  scale_fill_manual(values = colB)+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 12))

q

1.14.1 slightly burden by individual

### 将多个组织的命名方式都添加进去,便于排序画图使用
dataFile <- c("data/011_geneExpression/TissueHashMap.txt")
dfTissueHashMap <- read.delim(dataFile) %>%
  rename(Tissue = Tissue_inOri)

AoCase <- "SlightlyBurden"

df <- dfstatistic %>%
  left_join(.,dfTissueHashMap,by="Tissue") %>%
  filter(SlightlyOrStrongly == AoCase) %>%
  mutate(High_level_tissue=factor(High_level_tissue,levels = c("roots","leaves/shoots","spike","grain"))) %>%
  arrange(High_level_tissue,-ave) ## 注意排序!!!

### 画图的时候根据数据框的顺序画出来
### 35 个组织的顺序定义
df$Tissue <- factor(df$Tissue,levels = df$Tissue)

colB <- c("#9a784c","#749b64","#e6b700","#b3b563") ## 4个组织的颜色

p <- df %>%
  ggplot(aes(x= Tissue, y=ave,fill = High_level_tissue)) +
   geom_bar(stat="identity", position=position_dodge()) +
  geom_errorbar(aes(ymin=ave-se, ymax=ave+se), width=.2,
                 position=position_dodge(.9))+
  labs(x="",y= "Slightly burden in CDS")+
  geom_hline(yintercept = 0.14, color = "gray",linetype="dashed")+
  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  scale_fill_manual(values = colB)+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 12))

p

ggsave(p,filename = "~/Documents/slightlyBurden_in_Tissue.pdf",height = 5,width = 7)

### 查看每个组织有多少个基因含有burden值
p <- df %>%
  ggplot(aes(x= Tissue, y=n,fill = High_level_tissue)) +
   geom_bar(stat="identity", position=position_dodge()) +
  labs(x="",y= "Gene number with strongly burden")+
  geom_hline(yintercept = 0.0318, color = "gray",linetype="dashed")+
  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  scale_fill_manual(values = colB)+
  theme(plot.margin=unit(rep(1,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 12))

p

1.15 4.Number of tissues vs mutation burden

### 首先找到每个基因组织表达的宽度
df_tissueBreath <- read_tsv("data/011_geneExpression/002_tissueBreath/002_Azhurnaya_tissue_breadth_100Kgenet_1TPM_miniVersion.txt")
## Rows: 74453 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Chr, Gene
## dbl (3): Pos, Pos_scale, ExpressionBreadth
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_tissueBreath %>% 
  ggplot(aes(x=ExpressionBreadth))+
  geom_histogram()+
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### 其次找到每个基因的load值
df_burden1 <- read_tsv("data/fromDaxing/Burden_Expression/004_burdenMean_vs_expression/GeneBurden.averageIndividualBurden.txt.gz")
## Rows: 156026 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): geneName, SlightlyOrStrongly
## dbl (4): n, mean_burden, sd_burden, rsd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_burden1 %>% 
  ggplot(aes(x= log10(mean_burden)))+
  facet_wrap(~SlightlyOrStrongly)+
  geom_histogram()+
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 143480 rows containing non-finite values (stat_bin).

### 再次将load和组织表达宽度进行相关性分析
df_burden1_tissueBreath <- df_burden1 %>% 
  left_join(y = df_tissueBreath,by = c("geneName"="Gene"))

temp <- df_burden1_tissueBreath %>% 
  ###手动分割bin
  # mutate(bin=cut(ExpressionBreadth,breaks = c(0,2,4,6,10,15,20,23))) %>% 
  mutate(bin=cut_width(ExpressionBreadth,width = 2,boundary = 0)) %>%
  filter(!is.na(bin)) %>% 
  group_by(SlightlyOrStrongly,bin) %>% 
  count() 

### 颜色配置
colA <- colorRampPalette(brewer.pal(9,"Reds")[c(4,5,6,7,8,9)])(6)
colC <- rev(colorRampPalette(brewer.pal(9,"Reds")[c(4,5,6,7,8,9)])(5))
colB <- c(colA,colC)

p <- df_burden1_tissueBreath %>% 
  filter(SlightlyOrStrongly=="StronglyBurden") %>%
  # filter(mean_burden!=0,is.finite(mean_burden)) %>%
  # filter(is.finite(mean_burden)) %>
  # mutate(bin=cut(ExpressionBreadth,breaks = c(0,2,4,6,10,15,20,23))) %>% 
  mutate(bin=cut_width(ExpressionBreadth,width = 2,boundary = 0)) %>%
  filter(!is.na(bin)) %>% 
  ggplot(aes(x=bin, y=mean_burden,color=bin)) +
  stat_summary(fun.data = mean_se,size=0.8)+
  scale_fill_manual(values = colB)+
  # facet_grid(SlightlyOrStrongly~.,scales = "free")+
  labs(x="Tissue breadth of gene expression", y= "Strongly mutation load per gene")+
  scale_x_discrete(breaks=
                     c("[0,2]","(6,8]","(12,14]","(18,20]"),
                   labels=c("2","8","14","20"))+
  scale_color_manual(values = colB)+
  # coord_cartesian(ylim = c(0,0.09))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 12))

p
## Warning: Removed 38287 rows containing non-finite values (stat_summary).

ggsave(plot = p, "~/Documents/strongly_vs_tissueBreath.pdf",height = 3,width = 4)
## Warning: Removed 38287 rows containing non-finite values (stat_summary).
### 颜色配置需要修改一下,根据变化的趋势
colC <- rev(colorRampPalette(brewer.pal(9,"Reds")[c(4,5,6,7,8,9)])(11))
colB <- c(colC)
p <- df_burden1_tissueBreath %>% 
  filter(SlightlyOrStrongly=="SlightlyBurden") %>%
  # filter(mean_burden!=0,is.finite(mean_burden)) %>%
  # filter(is.finite(mean_burden)) %>
  # mutate(bin=cut(ExpressionBreadth,breaks = c(0,2,4,6,10,15,20,23))) %>% 
  mutate(bin=cut_width(ExpressionBreadth,width = 2,boundary = 0)) %>%
  filter(!is.na(bin)) %>% 
  ggplot(aes(x=bin, y=mean_burden,color=bin)) +
  stat_summary(fun.data = mean_se,size=0.8)+
  scale_fill_manual(values = colB)+
  # facet_grid(SlightlyOrStrongly~.,scales = "free")+
  labs(x="Tissue breadth of gene expression", y= "Slightly mutation load per gene")+
  scale_x_discrete(breaks=
                     c("[0,2]","(6,8]","(12,14]","(18,20]"),
                   labels=c("2","8","14","20"))+
  scale_color_manual(values = colB)+
  # coord_cartesian(ylim = c(0,0.09))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7),
    legend.position = 'none',
    text = element_text(size = 12))

p
## Warning: Removed 47697 rows containing non-finite values (stat_summary).

ggsave(plot = p, "~/Documents/slightly_vs_tissueBreath.pdf",height = 3,width = 4)
## Warning: Removed 47697 rows containing non-finite values (stat_summary).
# p <- df_burden1_tissueBreath %>% 
#   filter(SlightlyOrStrongly=="StronglyBurden") %>%
#   filter(mean_burden!=0,is.finite(mean_burden)) %>%
#   # filter(is.finite(mean_burden)) %>% 
#   # mutate(bin=cut(ExpressionBreadth,breaks = c(0,2,4,6,10,15,20,23))) %>% 
#   mutate(bin=cut_width(ExpressionBreadth,width = 2,boundary = 0)) %>%
#   filter(!is.na(bin)) %>% 
#   ggplot(aes(x=bin, y=mean_burden,color=bin)) +
# # stat_boxplot(geom = "errorbar",width=0.05,position = position_dodge(1))+ # add error bar
# # geom_boxplot(aes(fill=bin),outlier.shape = NA) +
# # geom_point(position = position_jitter(width = 0.2), size=0.1)+
# # stat_summary(fun=median, geom="point", color="blue",
# #              position = position_dodge(1),
# #              size=0.1)+
#   # stat_summary(fun.data = mean_se,size=0.2)+
#   geom_violin(aes(fill=bin),position = position_dodge(0.8)) +
#   stat_summary(fun=mean, geom="point", color="black",position = position_dodge(0.8),size=0.5)+
#   scale_fill_manual(values = colB)+
#   # facet_grid(SlightlyOrStrongly~.,scales = "free")+
#   labs(x="Tissue breadth of gene expression", y= "Strongly mutation load per gene")+
#   scale_x_discrete(breaks=
#                      c("[0,2]","(6,8]","(12,14]","(18,20]"),
#                    labels=c("2","8","14","20"))+
#   scale_color_manual(values = colB)+
#   coord_cartesian(ylim = c(0,0.09))+
#   theme(
#     panel.background = element_blank(),
#     panel.border = element_rect(colour = "black", fill=NA, size=0.7),
#     legend.position = 'none',
#     text = element_text(size = 9))
# 
# p
# ggsave(plot = p, "~/Documents/Burden1_vs_TPM.pdf",height = 2,width = 2)


# ### 过滤方式不一样
# p <- df_burden1_tissueBreath %>% 
#   filter(SlightlyOrStrongly=="StronglyBurden") %>%
#   # filter(mean_burden!=0,is.finite(mean_burden)) %>%
#   filter(is.finite(mean_burden)) %>%
#   # mutate(bin=cut(ExpressionBreadth,breaks = c(0,2,4,6,10,15,20,23))) %>% 
#   mutate(bin=cut_width(ExpressionBreadth,width = 2,boundary = 0)) %>%
#   filter(!is.na(bin)) %>% 
#   ggplot(aes(x=bin, y=mean_burden,color=bin)) +
# # stat_boxplot(geom = "errorbar",width=0.05,position = position_dodge(1))+ # add error bar
# # geom_boxplot(aes(fill=bin),outlier.shape = NA) +
# # geom_point(position = position_jitter(width = 0.2), size=0.1)+
# # stat_summary(fun=median, geom="point", color="blue",
# #              position = position_dodge(1),
# #              size=0.1)+
#   # stat_summary(fun.data = mean_se,size=0.2)+
#   geom_violin(aes(fill=bin),position = position_dodge(0.8)) +
#   stat_summary(fun=mean, geom="point", color="black",position = position_dodge(0.8),size=0.5)+
#   scale_fill_manual(values = colB)+
#   # facet_grid(SlightlyOrStrongly~.,scales = "free")+
#   labs(x="Tissue breadth of gene expression", y= "Strongly mutation load per gene")+
#   scale_x_discrete(breaks=
#                      c("[0,2]","(6,8]","(12,14]","(18,20]"),
#                    labels=c("2","8","14","20"))+
#   scale_color_manual(values = colB)+
#   coord_cartesian(ylim = c(0,0.09))+
#   theme(
#     panel.background = element_blank(),
#     panel.border = element_rect(colour = "black", fill=NA, size=0.7),
#     legend.position = 'none',
#     text = element_text(size = 9))
# 
# p
# ggsave(plot = p, "~/Documents/Burden1_vs_TPM2.pdf",height = 2,width = 2)

1.15.0.1 ========================

1.16 del ratio whole genome scan

### step1:根据原来数据框,求出 2方面值,1:单位CDS的结果,2:有害突变和同义突变的比
# df <- read_tsv("data/006_Fig2/genomeScan/002_delVSsynOnChr_10000000window1000000step_addEffectiveCDSLength_addRecom.txt")

# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/009_genomeScan_delvcSyn/002/001_delVSsynOnChr_20000000window5000000step_addEffectiveCDSLength.txt")

# df1 <- df %>%
#   mutate(SynFre=.$a001_synonymous/.$CDSLength,
#          NonsynFre=.$a002_nonsynonymous/.$CDSLength,
#          DelFre=.$a003_nonsynGERPandDerivedPPH2/.$CDSLength,
#          PPH2Fre=.$a004_nonsynDerivedPPH2/.$CDSLength,
#          GERPFre=.$a005_GERP_16way/.$CDSLength,
#          StopGainFre=.$a006_StopGain/.$CDSLength,
#          ### ratio
#          NonsynSynRatio=.$a002_nonsynonymous/.$a001_synonymous,
#          DelSynRatio=.$a003_nonsynGERPandDerivedPPH2/.$a001_synonymous,
#          PPH2SynRatio=.$a004_nonsynDerivedPPH2/.$a001_synonymous,
#          GERPSynRatio=.$a005_GERP_16way/.$a001_synonymous,
#          StopGainSynRatio=.$a006_StopGain/.$a001_synonymous
#          )

# write_tsv(df1,"data/006_Fig2/genomeScan/003_delVSsynOnChr_10000000window1000000step_addEffectiveCDSLength_addRecom.txt")

# write_tsv(df1,"/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/004_annoDB/009_genomeScan_delvcSyn/002/001_delVSsynOnChr_20000000window5000000step_addEffectiveCDSLength.txt")

### step2:根据step1的结果,求出 windowScan 滑窗后的数据框
# library(tidyverse)
# library(windowscanr)
# df1 <- read_tsv("data/006_Fig2/genomeScan/003_delVSsynOnChr_10000000window1000000step_addEffectiveCDSLength_addRecom.txt") 
# 
# df <- df1 %>% 
#   filter(!is.na(.$DelSynRatio),!is.na(.$RecombinationRate)) 
# 
# ### check del/syn 最大值和最小值
# max(df$DelSynRatio)
# min(df$DelSynRatio)
# max(df$RecombinationRate)
# min(df$RecombinationRate)
# 
# pos_win <- winScan(x = df, 
#                    groups = "CHROM", 
#                    position = "BIN_START_scale" , 
#                    values = c("NonsynSynRatio", 
#                               "DelSynRatio",
#                               "PPH2SynRatio",
#                               "GERPSynRatio",
#                               "StopGainSynRatio",
#                               "RecombinationRate"), 
#                    win_size = 5,
#                    win_step = 2,
#                    funs = c("mean", "sd"))
# 
# head(pos_win)
# df <- pos_win
# df3 <- mutate(df,Sub=substring(df$CHROM,2,2))
# df <- df3
# 
# write_tsv(df,"data/006_Fig2/genomeScan/004_delVSsynOnChr_windowScanbyRpackage.txt")

1.17 plot

df <- read_tsv("data/006_Fig2/genomeScan/004_delVSsynOnChr_windowScanbyRpackage.txt")
## Rows: 1020 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): CHROM, Sub
## dbl (21): win_start, win_end, win_mid, NonsynSynRatio_n, NonsynSynRatio_mean...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rec <- data.frame(x1=c(0,9,28,50,80), x2=c(9,28,50,80,100), y1=c(-Inf,-Inf,-Inf,-Inf,-Inf), y2=c(Inf,Inf,Inf,Inf,Inf), region=c('R1','R2a','C','R2b','R3'))
colB <- c("#fffcd4","#1e3c84","#3ab4c4","#3ab4c4","#1e3c84")
p <- df %>% 
  ggplot()+
  # geom_rect(data=rec, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2,fill=region))+
  annotate(geom = "rect", xmin = 28.14,xmax = 53.08, ymin = -Inf, ymax = Inf, fill = "#fffcd4",alpha = 1)+
  geom_line(mapping=aes(x=df$win_start,y=df$DelSynRatio_mean,group=CHROM),col='#F4A460')+
  stat_smooth(mapping=aes(x=df$win_start,y=df$DelSynRatio_mean),col = "#d5311c",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
  # geom_point()+
  geom_line(mapping=aes(x=df$win_start,y=df$RecombinationRate_mean/2,group=CHROM),col='#56B4E9')+
  stat_smooth(df, mapping=aes(x=df$win_start,y=df$RecombinationRate_mean/2),col = "blue",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
    # geom_point(df_centromere,mapping = aes(x=x,y=y),color='black',size=0.5)+
  xlab("Scale position")+ylab("Del SNPs/syn SNPs") +
  theme(legend.position = 'none')+
  scale_fill_manual(values = colB)+
  scale_y_continuous(expand = c(0, 0),sec.axis = sec_axis(~.*2, name="Recombination rate"))+
  scale_x_continuous(expand = c(0, 0))+
  theme(
    # plot.margin=unit(rep(1,4),'lines'),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 3.6)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

1.18 nonsyn ratio

df <- read_tsv("data/006_Fig2/genomeScan/004_delVSsynOnChr_windowScanbyRpackage.txt")
## Rows: 1020 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): CHROM, Sub
## dbl (21): win_start, win_end, win_mid, NonsynSynRatio_n, NonsynSynRatio_mean...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rec <- data.frame(x1=c(0,9,28,50,80), x2=c(9,28,50,80,100), y1=c(-Inf,-Inf,-Inf,-Inf,-Inf), y2=c(Inf,Inf,Inf,Inf,Inf), region=c('R1','R2a','C','R2b','R3'))
colB <- c("#fffcd4","#1e3c84","#3ab4c4","#3ab4c4","#1e3c84")
p <- df %>% 
  ggplot()+
  # geom_rect(data=rec, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2,fill=region))+
  annotate(geom = "rect", xmin = 28.14,xmax = 53.08, ymin = -Inf, ymax = Inf, fill = "#fffcd4",alpha = 1)+
  geom_line(mapping=aes(x=df$win_start,y=df$NonsynSynRatio_mean,group=CHROM),col='#F4A460')+
  stat_smooth(mapping=aes(x=df$win_start,y=df$NonsynSynRatio_mean),col = "#d5311c",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
  # geom_point()+
  geom_line(mapping=aes(x=df$win_start,y=df$RecombinationRate_mean*2,group=CHROM),col='#56B4E9')+
  stat_smooth(df, mapping=aes(x=df$win_start,y=df$RecombinationRate_mean*2),col = "blue",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
    # geom_point(df_centromere,mapping = aes(x=x,y=y),color='black',size=0.5)+
  xlab("Scale position")+ylab("Nonsynonymous SNPs/syn SNPs") +
  theme(legend.position = 'none')+
  scale_fill_manual(values = colB)+
  scale_y_continuous(expand = c(0, 0),sec.axis = sec_axis(~./2, name="Recombination rate"))+
  scale_x_continuous(expand = c(0, 0))+
  theme(
    # plot.margin=unit(rep(1,4),'lines'),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 3.6)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

1.19 VEP high impact

df <- read_tsv("data/006_Fig2/genomeScan/004_delVSsynOnChr_windowScanbyRpackage.txt")
## Rows: 1020 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): CHROM, Sub
## dbl (21): win_start, win_end, win_mid, NonsynSynRatio_n, NonsynSynRatio_mean...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rec <- data.frame(x1=c(0,9,28,50,80), x2=c(9,28,50,80,100), y1=c(-Inf,-Inf,-Inf,-Inf,-Inf), y2=c(Inf,Inf,Inf,Inf,Inf), region=c('R1','R2a','C','R2b','R3'))
colB <- c("#fffcd4","#1e3c84","#3ab4c4","#3ab4c4","#1e3c84")
p <- df %>% 
  ggplot()+
  # geom_rect(data=rec, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2,fill=region))+
  annotate(geom = "rect", xmin = 28.14,xmax = 53.08, ymin = -Inf, ymax = Inf, fill = "#fffcd4",alpha = 1)+
  geom_line(mapping=aes(x=df$win_start,y=df$StopGainSynRatio_mean,group=CHROM),col='#F4A460')+
  stat_smooth(mapping=aes(x=df$win_start,y=df$StopGainSynRatio_mean),col = "#d5311c",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
  # geom_point()+
  geom_line(mapping=aes(x=df$win_start,y=df$RecombinationRate_mean/9,group=CHROM),col='#56B4E9')+
  stat_smooth(df, mapping=aes(x=df$win_start,y=df$RecombinationRate_mean/9),col = "blue",method = "loess",formula = y~x,span = 0.04,se=F,size=1) +
    # geom_point(df_centromere,mapping = aes(x=x,y=y),color='black',size=0.5)+
  xlab("Scale position")+ylab("High impact SNPs/syn SNPs") +
  theme(legend.position = 'none')+
  scale_fill_manual(values = colB)+
  scale_y_continuous(expand = c(0, 0),sec.axis = sec_axis(~.*9, name="Recombination rate"))+
  scale_x_continuous(expand = c(0, 0))+
  theme(
    # plot.margin=unit(rep(1,4),'lines'),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.7)
    ,legend.position = 'none'
    ,text = element_text(size = 9))
p
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

ggsave(p,filename = "~/Documents/test.pdf",height = 2,width = 3.6)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.52
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.48
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 20.07

1.19.0.1 ============

1.20 Extended data

1.20.0.1 ============

1.21 SNP count in gene region

1.22 查看 coding 区有哪些类型及个数

df_test <- dftotal %>% 
  filter(Region=="CDS") %>% 
  group_by(Variant_type) %>% 
  count()

1.23 FigS:SNP count in gene region

UTR Synonymous Stop-loss Stop-gain Start-lost Nonsynonymous 以上几种类型的计数

### Goal: 将 Region列和Variant_type列柔和在一起,成为新的一列,即将UTR的信息加入分析
### 第一步,Region 列,是 UTR_3 UTR_5的,新列就命名为UTR,不是的,则按照Variant_type类型添加

dfUTR <- dftotal %>% 
  filter(Region == "UTR_3" | Region == "UTR_5") %>% 
  mutate(VariantGroup = "UTR")

dfnotUTR <- dftotal %>% 
  filter(Region == "CDS") %>% 
  mutate(VariantGroup = Variant_type)

dfnew <- rbind(dfUTR,dfnotUTR)

df_variantGroup<- dfnew %>% 
  filter(!is.na(VariantGroup)) %>%
  group_by(VariantGroup) %>% 
  summarise(n=n()) %>% 
  ungroup() %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(desc(VariantGroup))

# colB <- c("#a9a9a9","#e69f00","#871f12","#d5311c","#5a150c","#004680")
colB <- c("#e69f00","#871f12","#d5311c","#5a150c","#004680","#a9a9a9")

df_variantGroup$VariantGroup <- factor(df_variantGroup$VariantGroup,levels = c("NONSYNONYMOUS","START-LOST","STOP-GAIN", "STOP-LOSS","SYNONYMOUS","UTR"), labels = c("Nonsynonymous","Start-lost", "Stop-gain","Stop-loss","Synonymous","UTR"))
p <- df_variantGroup %>%
ggplot(aes(x=VariantGroup,y=n,fill=VariantGroup))+
  geom_bar(stat = 'identity',position = "stack")+
  # geom_text(aes(label=n),size=3,nudge_y = 1)+
    geom_text(aes(label=paste(n,"\n(",round(fre,3),")",sep = "")),size=3,
            position = position_dodge(0.9))+
  labs(x="", y="SNP count")+
  # scale_fill_brewer(palette = "Set2")+
  scale_fill_manual(values = colB)+
  # scale_fill_brewer(palette = "Blues")+
  theme_classic()+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(legend.position = "none",plot.margin=unit(rep(1,4),'lines'),
          axis.line = element_line(size=0.4, colour = "black"),text = element_text(size = 12))
p

ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 4)

1.24 FigS: AABBDD DAF

1.24.0.1 AABBDD: keep all sites in segregation

AABBDD AABB DD 三个群体都必须有分离

###step1: add AAF to GeneDB
dfAAF <- read_tsv("data/015_AAF/002_aaf_byPloidy/002_AAF_byPloidy.txt.gz")
## Rows: 2672838 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (4): Chr, Pos, AAF_AABB_DD, AAF_AABBDD
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- dftotal %>% 
  # mutate(Sub=str_sub(Transcript, 9, 9)) %>%
  ### 定义load
  mutate(Gerp_16way=ifelse(is.na(Gerp_16way),-100,Gerp_16way)) %>%
  mutate(Derived_PolyPhen2_prob=ifelse(is.na(Derived_PolyPhen2_prob),-100,Derived_PolyPhen2_prob)) %>%
  mutate(LoadGroup=ifelse(Derived_PolyPhen2_prob>=0.5 & Variant_type %in% c("NONSYNONYMOUS") & Gerp_16way >=1.5, "Deleterious", ifelse(Variant_type=="SYNONYMOUS", "Synonymous", ifelse(Variant_type=="NONSYNONYMOUS", "Nonsynonymous", NA)))) %>% 
  filter(!is.na(LoadGroup)) %>% 
  filter(!Ancestral == "NA") %>% 
  filter(Ancestral==Ref|Ancestral==Alt) %>% 
  ### 将AAF数据代入其中
  left_join(.,dfAAF,by=c("Chr","Pos")) %>% 
  mutate(DAF_42 = ifelse(Ref==Ancestral,AAF_AABB_DD,1-AAF_AABB_DD)) %>% 
  mutate(DAF_06 = ifelse(Ref==Ancestral,AAF_AABBDD,1-AAF_AABBDD))

#### ======================================================================
### 必须在六倍体群体分离
df06 <- df %>% 
  mutate(Sub=factor(Sub,levels = c("A","B","D"),labels = c("A subgenome","B subgenome","D subgenome"))) %>% 
  select(Sub,LoadGroup,DAF_42,DAF_06) %>% 
  ### data deal
  filter(!is.na(DAF_06)) %>%  ###*****
  mutate(bin=cut_width(DAF_06,width = 0.05,boundary = 0)) %>% 
  group_by(Sub,LoadGroup) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(Sub,desc(LoadGroup)) 
 
#### facet by sub
colB <- rep(c("#d5311c","#e69f00","#004680"),2)
q <- df06 %>% 
  ggplot(aes(x=bin,y=fre,group=LoadGroup,fill=LoadGroup))+
  geom_bar(stat = 'identity',position = "dodge")+
  labs(y="Proportion",x="Derived allele frequency")+
  # scale_color_brewer(palette = "Set2")+
  scale_fill_manual(values = colB)+
  # scale_fill_brewer(palette = "Blues")+
  scale_y_continuous(limits = c(0,0.855),breaks = seq(0,0.8,0.2))+
  scale_x_discrete(labels=c("0","","","","0.2","","","","0.4","","","","0.6","","","","0.8","","","",""))+
  facet_grid(Sub~.)+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      ,legend.position = 'none'
      ,text = element_text(size = 9))
q

ggsave(q,filename = "~/Documents/DAF_06.pdf",height = 3.17,width = 3.17)


#### ======================================================================
### 必须在四二倍体群体分离
df42 <- df %>% 
  mutate(Sub=factor(Sub,levels = c("A","B","D"),labels = c("A subgenome","B subgenome","D subgenome"))) %>% 
  select(Sub,LoadGroup,DAF_42,DAF_06) %>% 
  ### data deal
  filter(!is.na(DAF_42)) %>%  ###*****
  mutate(bin=cut_width(DAF_42,width = 0.05,boundary = 0)) %>% 
  group_by(Sub,LoadGroup) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n)) %>% 
  arrange(Sub,desc(LoadGroup)) 
 
#### facet by sub
colB <- rep(c("#d5311c","#e69f00","#004680"),2)
q <- df42 %>% 
  ggplot(aes(x=bin,y=fre,group=LoadGroup,fill=LoadGroup))+
  geom_bar(stat = 'identity',position = "dodge")+
  labs(y="Proportion",x="Derived allele frequency")+
  # scale_color_brewer(palette = "Set2")+
  scale_fill_manual(values = colB)+
  # scale_fill_brewer(palette = "Blues")+
  scale_y_continuous(limits = c(0,0.855),breaks = seq(0,0.8,0.2))+
  scale_x_discrete(labels=c("0","","","","0.2","","","","0.4","","","","0.6","","","","0.8","","","",""))+
  facet_grid(Sub~.)+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      ,legend.position = 'none'
      ,text = element_text(size = 9))
q

ggsave(q,filename = "~/Documents/DAF_42.pdf",height = 3.17,width = 3.17)

1.24.1 test

test <- c("A practical implementation of this notion would be refining gene constraint models to incorporate
contributions from constrained regulatory elements. This essentially borrows power from extending the
functional unit of a gene in the context ofthe gene regulatory network. Such an approach would allow
Constraint modeling for specific tissue/cell types and conditions given the diverse range of epigenomics
data. Taking enhancer-gene links from brain tissue as a proof-of-principle, a simple logistic regression test
onconditioning on gene constraint indicated significant contribution from enhancer constraint for predicting
functionally relevant genes (e.g, targets of Fragile x Mental Retardation Protein (FMRP), Logit P=2.84*10-8
, Methods). While we acknowledge that the biological consequences of mutations in enhancers are not
clearly understood and thus natural selection may differ in its interest depending on mechanistic
consequence, an extended model to incorporate non-coding variation information in a biologically-
informed way hold the promise to provide a finer characterization of gene function and facilitate a better
understanding of the molecular mechanisms underiving selection.")

test2 <- str_replace_all(test,"\n"," ")

test2
## [1] "A practical implementation of this notion would be refining gene constraint models to incorporate contributions from constrained regulatory elements. This essentially borrows power from extending the functional unit of a gene in the context ofthe gene regulatory network. Such an approach would allow Constraint modeling for specific tissue/cell types and conditions given the diverse range of epigenomics data. Taking enhancer-gene links from brain tissue as a proof-of-principle, a simple logistic regression test onconditioning on gene constraint indicated significant contribution from enhancer constraint for predicting functionally relevant genes (e.g, targets of Fragile x Mental Retardation Protein (FMRP), Logit P=2.84*10-8 , Methods). While we acknowledge that the biological consequences of mutations in enhancers are not clearly understood and thus natural selection may differ in its interest depending on mechanistic consequence, an extended model to incorporate non-coding variation information in a biologically- informed way hold the promise to provide a finer characterization of gene function and facilitate a better understanding of the molecular mechanisms underiving selection."